# Embed the scanned hand-written answer sheet for Question 1.
knitr::include_graphics("q1.jpeg")
$$\begin{align} &g = argmin_g(\sum_{i=1}^{n}(y_i −g(x_i))^2 + \lambda∫[g^{(m)}(x)]^2 dx) \\\\ &a). \lambda = \infty, m = 0 \\ &g = argmin_g(\sum_{i=1}^{n}(y_i −g(x_i))^2 + \infty∫[g(x)]^2 dx) \\ &\text{the penalty is extremely significant when } \lambda = \infty. \text{ When m = 0, } g^{(0)}=g, \\ &\text{thus to minimize the RSS we have to set g(x) = 0} \\\\ &b). \lambda = \infty, m = 1 \\ &g = argmin_g(\sum_{i=1}^{n}(y_i −g(x_i))^2 + \infty∫[g'(x)]^2 dx) \\ &\text{the penalty is extremely significant when } \lambda = \infty. \text{ When m = 1, } g^{(1)}=g', \\ &\text{thus to minimize the RSS we have to set g'(x) = 0, so g(x) is a constant} \\\\ &c). \lambda = \infty, m = 2 \\ &g = argmin_g(\sum_{i=1}^{n}(y_i −g(x_i))^2 + \infty∫[g''(x)]^2 dx) \\ &\text{the penalty is extremely significant when } \lambda = \infty. \text{ When m = 2, } g^{(2)}=g'', \\ &\text{thus to minimize the RSS we have to set g''(x) = 0, so g(x) is a linear function } = ax + b \\\\ &d). \lambda = \infty, m = 3 \\ &g = argmin_g(\sum_{i=1}^{n}(y_i −g(x_i))^2 + \infty∫[g'''(x)]^2 dx) \\ &\text{the penalty is extremely significant when } \lambda = \infty. \text{ When m = 3, } g^{(3)}=g''', \\ &\text{thus to minimize the RSS we have to set g'''(x) = 0, so g(x) is a quadratic function } = ax^2 + bx +c \\\\ &e). \lambda = 0, m = 3 \\ &g = argmin_g(\sum_{i=1}^{n}(y_i −g(x_i))^2 + 0*∫[g'''(x)]^2 dx) \\ &\text{the penalty is 0 when } \lambda = 0. \text{ When m = 3, } g^{(3)}=g''', \text{ thus we can have g(x) interpolate all samples and have RSS = 0 } \\ \end{align}$$
$$\begin{align} &\hat{f} (X)=2+3I(0≤X≤2)−3(X+1)I(1≤X≤2)-2(2X −2)I(3 ≤ X ≤ 4) + 2I(4<X≤5) \\\\ &\text{When } -2 \leq X < 0: \hat{f}(X) = 2 - 3*0 -3(X+1)*0 - 2(2X-2)*0 + 2*0 = 2 \\ &\text{None of the indicator variables are true; the slope is 0; the intercept is 2.} \\ \\ &\text{When 0} \leq X \leq 1: \hat{f}(X) = 2 + 3*1 -3(X+1)*0 - 2(2X-2)*0 + 2*0 = 5 \\ &\text{The indicator variable I(0≤X≤2) is true; the slope is 0; the intercept is 5.} \\\\ &\text{When } 1 < X \leq 2: \hat{f}(X) = 2 + 3*1 -3(X+1)*1 - 2(2X-2)*0 + 2*0 =-3X+2 \\ &\text{The indicator variables I(0≤X≤2) and I(1≤X≤2) are true; the slope is -3; the intercept is 2.} \\\\ &\text{When } 2 < X < 3: \hat{f}(X) = 2 - 3*0 -3(X+1)*0 - 2(2X-2)*0 + 2*0 = 2 \\ &\text{None of the indicator variables are true; the slope is 0; the intercept is 2.} \\\\ &\text{When } 3 \leq X \leq 4: \hat{f}(X) = 2 - 3*0 -3(X+1)*0 - 2(2X-2)*1 + 2*0 = -4X + 6 \\ &\text{The indicator variable I(3 ≤ X ≤ 4) is true; the slope is -4; the intercept is 6.} \\\\ &\text{When } 4 < X \leq 5: \hat{f}(X) = 2 - 3*0 -3(X+1)*0 - 2(2X-2)*0 + 2*1 = 4 \\ &\text{The indicator variable I(4<X≤5) is true; the slope is 0; the intercept is 4.} \\\\ &\text{When } 5 < X \leq 6: \hat{f}(X) = 2 - 3*0 -3(X+1)*0 - 2(2X-2)*0 + 2*0 = 2 \\ &\text{None of the indicator variables are true; the slope is 0; the intercept is 2.} &\end{align}$$
# Embed the scanned hand-written answer sheet for Question 2.
knitr::include_graphics("q2.jpeg")
$$\begin{align} &f(X) = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \beta_4(X − \psi)_+^3 \\\\ &a). \\ &\text{For all } X > \psi: \\ &(X − \psi)_+^3 = (X − \psi)^3, X > \psi \\ &f(X) = f_1(X) = (\beta_0 − \beta_4\psi^3) + (\beta_1 + 3\psi^2\beta_4)X + (\beta_2 − 3\beta_4\psi)X^2 + (\beta_3 + \beta_4)X^3 \\ &f(X) \text{ is a cubic polynomial} \\\\ &b). \\ &\text{For all } X \leq \psi: \\ &(X − \psi)_+^3 = 0, \\ & f(X) = f_2(X) = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 + \beta_4 * 0 \\ & f_2(X) = \beta_0 + \beta_1X + \beta_2X^2 + \beta_3X^3 \\ &f(X) \text{ is a cubic polynomial} \\\\ &c). \\ &\text{To show f(X) is continuous at } X=\psi \\ &f_1(X) = (\beta_0 − \beta_4\psi^3) + (\beta_1 + 3\psi^2\beta_4)X + (\beta_2 − 3\beta_4\psi)X^2 + (\beta_3 + \beta_4)X^3 \\ &f_1(\psi) = (\beta_0 − \beta_4\psi^3) + (\beta_1 + 3\psi^2\beta_4)\psi + (\beta_2 − 3\beta_4\psi)\psi^2 + (\beta_3 + \beta_4)\psi^3 \\ &f_1(\psi) = \beta_0 − \beta_4\psi^3 + \beta_1\psi + 3\psi^3\beta_4 + \beta_2\psi^2 − 3\beta_4\psi^3 + \beta_3\psi^3 + \beta_4\psi^3 \\ &f_1(\psi) = \beta_0 + \beta_1\psi + \beta_2\psi^2 + \beta_3\psi^3 = f_2(\psi) \\ &\text{f(X) is continuous at } X=\psi \\\\ &d). \\ &\text{To show }f'(X) \text{ is continuous at } X=\psi \\ &f_1'(\psi) = \beta_1 + 2\beta_2\psi + 3\beta_3\psi^2 = f_2'(\psi) \\ &f'(X)\text{ is continuous at } X=\psi \\\\ &e). \\ &\text{To show }f''(X) \text{ is continuous at } X=\psi \\ &f_1''(\psi) = 2\beta_2 + 6\beta_3\psi = f_2''(\psi) \\ &f''(X)\text{ is continuous at } X=\psi \\\\ &\text{Thus f(X) is a cubic spline regardless of the values of the } \beta \text{'s} \end{align}$$
# Load the ISLR2 package, which provides the Wage data set.
library(ISLR2)
# NOTE(review): attach() is discouraged (it masks names and obscures scoping),
# but later chunks rely on the bare `age`/`wage` bindings it creates, so it is
# kept here -- removing it would break those chunks.
attach(Wage)
# Preview the first few rows of Wage.
head(Wage)
## year age maritl race education region
## 231655 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 86582 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 161300 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 155159 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 11443 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 376662 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 231655 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 86582 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 161300 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 155159 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 11443 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 376662 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
# Reproducible 50/50 train/test split of the Wage data.
set.seed(2)
# seq_len() is preferred over 1:nrow() (the latter yields c(1, 0) on an
# empty data frame).
train <- sample(seq_len(nrow(Wage)), nrow(Wage) / 2)
Wage.train <- Wage[train, ]
Wage.test <- Wage[-train, ]
# a). polynomial regression
# boot provides cv.glm() for cross-validation.
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:mosaic':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
# Choose the polynomial degree for wage ~ age by 10-fold cross-validation.
# NOTE(review): CV is run on the full Wage data rather than only the training
# half; this mirrors the original analysis but is slightly optimistic.
set.seed(1)
max_degree <- 10
mse_poly <- rep(0, max_degree)
for (i in seq_len(max_degree)) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  # delta[1] is the raw (uncorrected) cross-validation error estimate.
  mse_poly[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
bestdeg <- which.min(mse_poly)
print(paste("Best degree using 10-fold CV is:", bestdeg))
## [1] "Best degree using 10-fold CV is: 9"
# Plot the 10-fold CV error against polynomial degree.
# seq_along() replaces the c(1:10) anti-pattern and stays in sync with
# the length of mse_poly.
degrees <- seq_along(mse_poly)
plot(x = degrees, y = mse_poly, type = "l",
     xlab = "Degree of Poly",
     ylab = "10-fold CV MSE",
     main = "10-fold CV MSE for polynomial model with different degrees")
# Mark the individual (degree, MSE) points on the curve.
points(x = degrees, y = mse_poly)
# repr lets us control figure dimensions via options(repr.plot.*) below.
library(repr)
# Fit the CV-selected degree-9 polynomial on the training half only.
fit <- lm(wage ~ poly(age, 9), data=Wage, subset = train)
# Coefficient table (orthogonal polynomial basis), rounded to 3 decimals.
round(coef(summary(fit)), 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112.247 1.034 108.506 0.000
## poly(age, 9)1 500.884 56.741 8.828 0.000
## poly(age, 9)2 -422.836 57.265 -7.384 0.000
## poly(age, 9)3 106.645 56.181 1.898 0.058
## poly(age, 9)4 -106.821 58.092 -1.839 0.066
## poly(age, 9)5 6.480 57.963 0.112 0.911
## poly(age, 9)6 66.865 57.633 1.160 0.246
## poly(age, 9)7 -13.766 57.853 -0.238 0.812
## poly(age, 9)8 -47.206 57.336 -0.823 0.410
## poly(age, 9)9 -95.484 56.694 -1.684 0.092
# Plot the degree-9 polynomial fit with a +/-2 SE band, then report test MSE.
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
poly_pred <- predict(fit, newdata = list(age = age.grid), se = TRUE)
# Approximate 95% pointwise confidence band: fitted value +/- 2 SEs.
upper <- poly_pred$fit + 2 * poly_pred$se.fit
lower <- poly_pred$fit - 2 * poly_pred$se.fit
options(repr.plot.width = 12, repr.plot.height = 6)
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey", xlab = "age", ylab = "wage")
title("Degree-9 Polynomial")
lines(age.grid, poly_pred$fit, lwd = 2, col = "blue")
matlines(age.grid, cbind(upper, lower), lwd = 1, col = "blue", lty = 3)
# Held-out test MSE of the degree-9 polynomial.
test_mse <- mean((predict(fit, data.frame(age = Wage.test$age)) - Wage.test$wage)^2)
print(paste0("the mse is ", test_mse))
## [1] "the mse is 1598.00616865424"
The test MSE of the degree-9 polynomial is 1598. I used 10-fold CV to choose the polynomial degree with the smallest CV error; the selected degree is 9. The plot is based on all of the data. The confidence band is fairly wide over the whole range of age, especially at the two ends.
# b). stepwise (step) function
# cut() splits age into 7 equal-width intervals; each interval gets its own
# constant, with the leftmost interval absorbed into the intercept.
fit <- lm(wage ~ cut(age, 7), data=Wage, subset = train)
# Coefficient table rounded to 3 decimals.
round(coef(summary(fit)), 3)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.177 3.423 23.424 0.000
## cut(age, 7)(26.9,35.7] 24.950 4.100 6.085 0.000
## cut(age, 7)(35.7,44.6] 37.189 3.995 9.308 0.000
## cut(age, 7)(44.6,53.4] 38.083 3.976 9.577 0.000
## cut(age, 7)(53.4,62.3] 43.636 4.338 10.059 0.000
## cut(age, 7)(62.3,71.1] 29.636 7.132 4.155 0.000
## cut(age, 7)(71.1,80.1] 26.410 12.555 2.103 0.036
# Plot the 7-interval step-function fit with a +/-2 SE band and report test MSE.
agelims <- range(age)
age.grid <- seq(from = agelims[1], to = agelims[2])
step_pred <- predict(fit, newdata = list(age = age.grid), se = TRUE)
# Approximate 95% pointwise confidence band (fit +/- 2 SE).
band <- cbind(step_pred$fit + 2 * step_pred$se.fit,
              step_pred$fit - 2 * step_pred$se.fit)
options(repr.plot.width = 12, repr.plot.height = 6)
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey", xlab = "age", ylab = "wage")
title("7-cut stepwise function")
lines(age.grid, step_pred$fit, lwd = 2, col = "blue")
matlines(age.grid, band, lwd = 1, col = "blue", lty = 3)
# Held-out test MSE of the step-function model.
print(paste0("the mse is ", mean((predict(fit, data.frame(age = Wage.test$age)) - Wage.test$wage)^2)))
## [1] "the mse is 1620.41538330396"
The test MSE for the step function is 1620.4153. I used 7 cuts in this case. The confidence band widens when age is greater than 60. Compared with the polynomial, it has a narrower confidence band when age is small.
# c). piecewise polynomial: a separate cubic fit on each of 4 age intervals.
# Fit a cubic polynomial of age on one subset of the training data and
# return predictions plus a +/-2 SE confidence band over `grid`.
piece_fit <- function(dat, grid) {
  pred <- predict(lm(wage ~ poly(age, 3), data = dat),
                  newdata = list(age = grid), se = TRUE)
  list(fit = pred$fit,
       bands = cbind(pred$fit + 2 * pred$se.fit,
                     pred$fit - 2 * pred$se.fit))
}
# Knots at 33.5, 49 and 64.5; ages are integers, so no observation falls
# exactly on a knot and the </>/>= boundaries partition the data cleanly.
grid1 <- seq(from = agelims[1], to = 33.5)
grid2 <- seq(from = 33.5, to = 49)
grid3 <- seq(from = 49, to = 64.5)
grid4 <- seq(from = 64.5, to = agelims[2])
p1 <- piece_fit(Wage.train[Wage.train$age < 33.5, ], grid1)
p2 <- piece_fit(Wage.train[Wage.train$age > 33.5 & Wage.train$age <= 49, ], grid2)
p3 <- piece_fit(Wage.train[Wage.train$age > 49 & Wage.train$age <= 64.5, ], grid3)
p4 <- piece_fit(Wage.train[Wage.train$age > 64.5, ], grid4)
plot(age, wage, xlim = agelims, cex = .5, col = "darkgrey",
     xlab = "age", ylab = "wage", main = "piecewise function")
# BUG FIX: the original passed `ity = "dashed"` -- a typo for `lty` -- so the
# requested dashed style was never applied to the confidence bands.
matlines(grid1, p1$bands, lwd = 1, col = "blue", lty = "dashed")
matlines(grid2, p2$bands, lwd = 1, col = "blue", lty = "dashed")
matlines(grid3, p3$bands, lwd = 1, col = "blue", lty = "dashed")
matlines(grid4, p4$bands, lwd = 1, col = "blue", lty = "dashed")
lines(grid1, p1$fit, lwd = 2, col = "blue")
lines(grid2, p2$fit, lwd = 2, col = "blue")
lines(grid3, p3$fit, lwd = 2, col = "blue")
lines(grid4, p4$fit, lwd = 2, col = "blue")
I fit the piecewise function with three knots — 33.5, 49, 64.5 — giving four intervals, each with a degree-3 polynomial. The function is not continuous at the knots. The confidence band widens in the fourth interval, i.e. for age greater than 64.5.
# d). cubic spline
# splines provides the B-spline basis function bs().
library(splines)
# Cubic regression spline (bs() defaults to degree 3) with interior knots at
# ages 25, 40 and 60, fitted on the training half.
fit <- lm(wage ~ bs(age, knots = c(25, 40, 60)), data = Wage, subset = train)
# se = TRUE spelled out (T is reassignable); returns pointwise SEs in se.fit.
pred_bs <- predict(fit, newdata = list(age = age.grid), se = TRUE)
summary(fit)
##
## Call:
## lm(formula = wage ~ bs(age, knots = c(25, 40, 60)), data = Wage,
## subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -98.088 -24.749 -4.806 14.801 199.796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.0395 15.1852 4.020 6.12e-05 ***
## bs(age, knots = c(25, 40, 60))1 0.9202 19.6111 0.047 0.96258
## bs(age, knots = c(25, 40, 60))2 49.2776 15.1583 3.251 0.00118 **
## bs(age, knots = c(25, 40, 60))3 54.6686 17.0136 3.213 0.00134 **
## bs(age, knots = c(25, 40, 60))4 66.9080 16.5526 4.042 5.57e-05 ***
## bs(age, knots = c(25, 40, 60))5 45.7181 21.8860 2.089 0.03688 *
## bs(age, knots = c(25, 40, 60))6 31.0702 27.7077 1.121 0.26232
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.94 on 1493 degrees of freedom
## Multiple R-squared: 0.08677, Adjusted R-squared: 0.0831
## F-statistic: 23.64 on 6 and 1493 DF, p-value: < 2.2e-16
# Held-out test MSE of the cubic regression spline.
print(paste0("the mse is " , mean((predict(fit, data.frame(age = Wage.test$age)) - Wage.test$wage)^2)))
## [1] "the mse is 1600.354279369"
# NOTE(review): par(mfrow = c(1, 2)) reserves two panels but only one plot is
# drawn here; kept to preserve the original figure layout.
par(mfrow=c(1, 2))
plot(age, wage, col="gray", main="Cubic regression spline", cex=.5, xlab = "age", ylab = "wage")
lines(age.grid, pred_bs$fit, lwd=2)
# Use the full component name se.fit: the original `$se` only worked via
# partial matching, a known footgun with `$` on lists.
lines(age.grid, pred_bs$fit + 2 * pred_bs$se.fit, lty="dashed")
lines(age.grid, pred_bs$fit - 2 * pred_bs$se.fit, lty="dashed")
# Vertical dashed lines mark the three spline knots.
abline(v=25, lty="dashed", col="blue")
abline(v=40, lty="dashed", col="blue")
abline(v=60, lty="dashed", col="blue")
For the cubic spline, I used knots at (25, 40, 60). The confidence bands at the two ends widen as well. The test MSE is 1600.3543.
# e). smoothing spline
# Smoothing parameter chosen by leave-one-out cross-validation (cv = TRUE).
fit <- smooth.spline(Wage.train$age, Wage.train$wage, cv = TRUE)
## Warning in smooth.spline(Wage.train$age, Wage.train$wage, cv = TRUE): cross-
## validation with non-unique 'x' values seems doubtful
# The warning above is expected: age contains many tied values.
# Second fit with the smoothness fixed at 5 effective degrees of freedom.
fit2 <- smooth.spline(Wage.train$age, Wage.train$wage, df = 5)
summary(fit)
## Length Class Mode
## x 61 -none- numeric
## y 61 -none- numeric
## w 61 -none- numeric
## yin 61 -none- numeric
## tol 1 -none- numeric
## data 3 -none- list
## no.weights 1 -none- logical
## n 1 -none- numeric
## lev 61 -none- numeric
## cv 1 -none- logical
## cv.crit 1 -none- numeric
## pen.crit 1 -none- numeric
## crit 1 -none- numeric
## df 1 -none- numeric
## spar 1 -none- numeric
## ratio 1 -none- numeric
## lambda 1 -none- numeric
## iparms 5 -none- numeric
## auxM 0 -none- NULL
## fit 5 smooth.spline.fit list
## call 4 -none- call
# Overlay both smoothing splines on the raw data:
# blue = LOOCV-selected smoothness, red = df = 5.
plot(age, wage, xlim = agelims, cex=.5, col="darkgrey", main="Smoothing Spline", xlab = "age", ylab = "wage")
lines(fit, col = "blue", lwd = 2)
lines(fit2, col = "red", lwd = 2)
I chose df = 5 and fit it to the training data. The red line represents df = 5 and the blue line is the LOOCV-selected fit. The two lines match each other very well — much more closely than the previous models agree with one another. I think this model yields the best result on the test set.
# Auto data: regression trees, bagging, random forests and a GAM.
# Drop the non-numeric `name` column; keep all remaining predictors.
df <- subset(Auto, select = -name)
head(df)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
# Reproducible 50/50 train/test split of the Auto data.
set.seed(1)
# seq_len() is preferred over 1:nrow() (safe on an empty data frame).
train <- sample(seq_len(nrow(df)), nrow(df) / 2)
auto.test <- df[-train, "mpg"]
# Grow a large, essentially unpruned regression tree: mindev = 0 disables
# the default deviance-based stopping rule.
# NOTE(review): tree() requires library(tree), which is not loaded in this
# excerpt -- presumably attached earlier in the document; confirm.
tree.auto <- tree(mpg ~ ., df, subset = train, control = tree.control(nobs = length(train), mindev = 0))
summary(tree.auto)
##
## Regression tree:
## tree(formula = mpg ~ ., data = df, subset = train, control = tree.control(nobs = length(train),
## mindev = 0))
## Variables actually used in tree construction:
## [1] "displacement" "horsepower" "year" "acceleration" "weight"
## Number of terminal nodes: 32
## Residual mean deviance: 6.199 = 1017 / 164
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -7.16700 -1.18800 -0.04833 0.00000 0.96790 14.13000
# Visualize the full (unpruned) tree.
plot(tree.auto)
text(tree.auto, pretty = 0)
# Visualize a 7-leaf pruned version for comparison.
prune.auto <- prune.tree(tree.auto, best = 7)
plot(prune.auto)
text(prune.auto, pretty = 0)
# Predictions of the unpruned tree on the held-out half.
unpruned_pred <- predict(tree.auto, newdata = df[-train, ])
plot(unpruned_pred, auto.test, xlab = "prediction", ylab = "value in test set")
# The 45-degree line marks perfect prediction.
abline(0, 1)
# Test MSE of the unpruned tree.
mean((unpruned_pred - auto.test)^2)
## [1] 9.366727
# Test MSE for pruned trees at every size from 2 to 10 terminal nodes.
for (size in 2:10) {
  pruned_fit <- prune.tree(tree.auto, best = size)
  pruned_pred <- predict(pruned_fit, newdata = df[-train, ])
  mse_val <- round(mean((pruned_pred - auto.test)^2), 4)
  # Report the test MSE for this pruned-tree size.
  print(paste0("The MSE is ", mse_val, " when best is ", size))
}
## [1] "The MSE is 26.0513 when best is 2"
## [1] "The MSE is 20.0664 when best is 3"
## [1] "The MSE is 18.7148 when best is 4"
## [1] "The MSE is 16.6175 when best is 5"
## [1] "The MSE is 13.306 when best is 6"
## [1] "The MSE is 12.8271 when best is 7"
## [1] "The MSE is 12.6116 when best is 8"
## [1] "The MSE is 11.578 when best is 9"
## [1] "The MSE is 11.6808 when best is 10"
The test MSE for the unpruned tree is 9.3667. The test MSEs for pruned trees of sizes 2-10 are 26.0513, 20.0664, 18.7148, 16.6175, 13.306, 12.8271, 12.6116, 11.578 and 11.6808 respectively — all higher than the unpruned tree. Among the pruned trees, the best size is 9.
set.seed(1)
# Bagging = a random forest with mtry equal to the number of predictors
# (m = p = 7), so every split considers all variables.
# NOTE(review): randomForest() requires library(randomForest), not loaded in
# this excerpt -- presumably attached earlier in the document; confirm.
bag.auto <- randomForest(mpg ~ ., data = df, subset = train, mtry = 7, importance = TRUE)
bag.auto
##
## Call:
## randomForest(formula = mpg ~ ., data = df, mtry = 7, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 9.481201
## % Var explained: 85.59
# Compare bagged predictions with the held-out mpg values.
bagged_yhat <- predict(bag.auto, newdata = df[-train, ])
plot(bagged_yhat, auto.test, xlab = "bagging prediction", ylab = "value in test set")
# The 45-degree line marks perfect prediction.
abline(0, 1)
# Test MSE for bagging.
mean((bagged_yhat - auto.test)^2)
## [1] 7.023526
The test MSE for bagging is 7.0235. The tuning parameter is m = p = 7, i.e. all predictors are considered at each split, which is what makes this bagging rather than a random forest.
set.seed(1)
# Random forest for regression, using roughly m = p/3 variables per split.
# NOTE(review): mtry = 7/3 is not an integer; randomForest appears to truncate
# it internally, but floor(7/3) (= 2) would be clearer -- confirm before
# changing, since different handling would alter the knitted numbers.
# importance = TRUE spelled out (T is reassignable).
rf.auto <- randomForest(mpg ~ ., data = df,
                        subset = train, mtry = 7/3, importance = TRUE)
yhat.rf <- predict(rf.auto, newdata = df[-train, ])
# Test MSE for the random forest.
mean((yhat.rf - auto.test)^2)
## [1] 6.958561
# Variable-importance plot: which predictors matter most.
varImpPlot(rf.auto)
plot(yhat.rf, auto.test, xlab = "rf prediction", ylab = "value in test set")
# The 45-degree line marks perfect prediction.
abline(0, 1)
The test MSE is 6.9586. The tuning parameter is m = p/3 = 7/3 predictors per split, the usual choice for regression random forests.
# gam provides s() smoothing-spline terms for generalized additive models.
library(gam)
## Loading required package: foreach
## Loaded gam 1.20.1
# Smoothing-spline term (gam's s(), default df = 4) for each quantitative
# predictor; origin enters linearly.
gam.m <- gam(mpg ~ s(cylinders) + s(displacement) + s(horsepower) + s(weight) + s(acceleration) + s(year) + origin, data = df, subset = train)
summary(gam.m)
##
## Call: gam(formula = mpg ~ s(cylinders) + s(displacement) + s(horsepower) +
## s(weight) + s(acceleration) + s(year) + origin, data = df,
## subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.8945 -1.5559 0.1209 1.3163 11.8406
##
## (Dispersion Parameter for gaussian family taken to be 7.6361)
##
## Null Deviance: 12892.19 on 195 degrees of freedom
## Residual Deviance: 1298.132 on 169.9998 degrees of freedom
## AIC: 980.7754
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(cylinders) 1 8112.0 8112.0 1062.3291 < 2.2e-16 ***
## s(displacement) 1 829.5 829.5 108.6311 < 2.2e-16 ***
## s(horsepower) 1 163.5 163.5 21.4092 7.327e-06 ***
## s(weight) 1 140.6 140.6 18.4188 2.970e-05 ***
## s(acceleration) 1 62.3 62.3 8.1560 0.004827 **
## s(year) 1 1342.3 1342.3 175.7797 < 2.2e-16 ***
## origin 1 37.0 37.0 4.8393 0.029167 *
## Residuals 170 1298.1 7.6
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(cylinders) 3 8.8247 1.805e-05 ***
## s(displacement) 3 2.6158 0.0527740 .
## s(horsepower) 3 1.7774 0.1533438
## s(weight) 3 10.2654 3.014e-06 ***
## s(acceleration) 3 8.6317 2.299e-05 ***
## s(year) 3 7.4170 0.0001068 ***
## origin
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Partial-effect plots for each term, arranged three panels per row.
par(mfrow=c(1, 3))
plot(gam.m, se=TRUE, col="blue")
# Test MSE for the GAM on the held-out half.
yhat.gam <- predict(gam.m, newdata = df[-train, ])
mean((yhat.gam - auto.test)^2)
## [1] 6.470187
The test MSE is 6.47. The "Anova for Parametric Effects" p-values clearly show that all predictors are highly statistically significant even when a linear relationship is assumed. In the "Anova for Nonparametric Effects" table, the p-values for displacement and horsepower are large, which suggests a linear function is adequate for these two variables.
I prefer the regression tree model. In terms of accuracy, its test MSE of 9.3667 is not too far from the GAM's 6.47. I believe the regression tree is the easiest model to interpret for all audiences, since it simply splits the data into two groups repeatedly to find the best prediction.